1 Setup

This section and the next are relevant for reproducibility purposes. For results, please skip to section 3 (Quality Control)

  • Libraries
suppressPackageStartupMessages({
  library(data.table)
  library(DESeq2)
  library(gplots)
  library(here)
  library(hyperSpec)
  library(magrittr)
  library(parallel)
  library(plotly)
  library(pvclust)
  library(tidyverse)
  library(tximport)
  library(vsn)
  library(emoji)
})
  • Helper functions
source(here("UPSCb-common/src/R/featureSelection.R"))
  • Graphics
hpal <- colorRampPalette(c("blue","white","red"))(100)

2 Data

  • Sample information
samples <- read_csv(here("data/drought_roots.csv"),
                      col_types=cols(.default=col_factor()))
  • tx2gene translation table
tx2gene <- suppressMessages(read_delim(here("reference/annotation/tx2gene_update.tsv.gz"), delim="\t", col_names=c("TXID","GENE"), skip=1))
  • Raw data
filelist <- list.files(here("results/Salmon"), 
                          recursive = TRUE, 
                          pattern = "quant.sf",
                          full.names = TRUE)
  • Sanity check to ensure that the data is sorted according to the sample info
stopifnot(all(match(sub("[1-3]_151118_BC852HANXX_P2503_", "", sub("*_sortmerna_trimmomatic","",basename(dirname(filelist)))),
                    samples$SciLifeID) == 1:nrow(samples)))

samples_rep <-rbind(samples,samples)
samples_rep <- rbind(samples_rep,samples)
  • add filelist to samples as a new column
samples_rep %<>% mutate(Filenames = filelist)
samples_rep$BioID <- sub("[1-3]_151118_BC852HANXX_", "", sub("*_sortmerna_trimmomatic","",basename(dirname(samples_rep$Filenames))))
  • Read the expression at the gene level
txi <- suppressMessages(tximport(files = samples_rep$Filenames,
                                 type = "salmon",
                                 tx2gene=tx2gene,
                                 countsFromAbundance="lengthScaledTPM"))
counts <- txi$counts

counts <- do.call(
  cbind,
  lapply(split.data.frame(t(counts),
                          samples$SciLifeID),
         colSums))

csamples <- samples[,-1]
csamples <- csamples[match(colnames(counts),csamples$SciLifeID),]
colnames(counts) <- csamples$SciLifeID

👉 We’ve generated counts from abundances using the average transcript length, averaged over samples and to library size


 

3 Quality Control

3.1 “Not expressed” genes

sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(counts),digits=1),
        sum(sel),
        nrow(counts))
## [1] "8.3% percent (3627) of 43909 genes are not expressed"

3.2 Sequencing depth

  • Let us take a look at the sequencing depth, colouring by Level
dat <- tibble(x=colnames(counts),y=colSums(counts)) %>% 
  bind_cols(csamples)

ggplot(dat,aes(x,y,fill=Level)) + 
  geom_col() + 
  scale_y_continuous(name="reads") +
  facet_grid(~ factor(Level), scales = "free") +
  theme_bw() + 
  theme(axis.text.x=element_text(angle=90,size=4), axis.title.x=element_blank()) +
  labs(title ="Sample sequencing depth")

👉 There is not a big difference in the raw sequencing depth

3.3 Per-gene mean expression

i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.

ggplot(data.frame(value=log10(rowMeans(counts))),aes(x=value)) + 
  geom_density(na.rm = TRUE) +
  ggtitle("Gene mean raw counts distribution") +
  scale_x_continuous(name="mean raw counts (log10)") + 
  theme_bw()

👉 The per-gene mean expression is as expected since the highest peak is around 2

3.4 Per-sample expression

dat <- as.data.frame(log10(counts)) %>% 
  utils::stack() %>% 
  mutate(Level=samples_rep$Level[match(ind,csamples$SciLifeID)]) 


ggplot(dat,aes(x=values,group=ind,col=Level)) + 
  geom_density(na.rm = TRUE) + 
  ggtitle("Sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)") + 
  theme_bw()

👉 All samples have the same sequencing depth characteristics and there is no deviation when we look at one or the other condition

  • Export raw expression data
dir.create(here("data/analysis/salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file=here("data/analysis/salmon/raw-unormalised-gene-expression_data_merge.csv"))

 

4 Data normalisation

4.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample condition and replicate.

load(file=here("data/analysis/salmon/dds_merge.rda"))

4.2 Size factors

(i.e. the sequencing library size effect)

dds <- estimateSizeFactors(dds) %>% 
  suppressMessages()

boxplot(normalizationFactors(dds),
        main="Sequencing libraries size factor",
        las=2,log="y")
abline(h=1, col = "Red", lty = 3)

and without outliers:

boxplot(normalizationFactors(dds),
        main="Sequencing libraries size factor",
        las=2,log="y", outline=FALSE)
abline(h=1, col = "Red", lty = 3)

👉 The sequencing libraries size factor is a bit variable across samples

4.3 Validation

Let’s look at standard deviations before and after VST normalization. This plot is to see whether there is a dependency of SD on the mean.

Before:

meanSdPlot(log1p(counts(dds))[rowSums(counts(dds))>0,])

After VST normalization, the red line should be almost horizontal which indicates no dependency of variance on mean (homoscedastic).

4.4 Variance Stabilising Transformation

vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)

meanSdPlot(vst[rowSums(vst)>0,])

👉 We can conclude that the variance stabilization worked adequately even if the red line is not perfectly horizontal


 

4.5 QC on the normalised data

4.5.1 PCA

pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)

4.5.2 Scree plot

We define the number of variables of the model (=1) and the number of possible combinations (=8).

We plot the percentage explained by different components

  • the red line represents number of variables in the model
  • the orange line represents number of variable combinations.
nvar=1
nlevel=nlevels(dds$Level)
ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
  geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
  scale_x_continuous("Principal component") + 
  geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
  geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)+
  ggtitle("Percentage of variance explained by different components")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.

4.5.3 PCA plot

dds$Filenames <- sub("*_151118_BC852HANXX_P2503_", "_", sub("*_sortmerna_trimmomatic","",basename(dirname(dds$Filenames))))
pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    as.data.frame(colData(dds)))

p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Level,shape=Level,text=Filenames)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")

ggplotly(p) %>% 
  layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
         yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))
## Warning: The shape palette can deal with a maximum of 6 discrete values because
## more than 6 becomes difficult to discriminate; you have 8. Consider
## specifying shapes manually if you must have them.

👉 Some conditions clusterized better than others in the PCA plot

4.6 Sequencing depth

Number of genes expressed per condition at different cutoffs:

conds <- factor(dds$Level)
dev.null <- rangeSamplesSummary(counts=vst,
                                conditions=conds,
                                nrep=3)

4.7 Heatmap

Filter for noise

sels <- rangeFeatureSelect(counts=vst,
                           conditions=conds,
                           nrep=3) %>% 
  suppressWarnings()

vst.cutoff <- 2
abline(h=10000, col="Red", lty=3)

  • Set the cut off to 8 in order to retrieve less than 10 000 genes
vst.cutoff <- 8

hm_reduced <- heatmap.2(t(scale(t(vst[sels[[vst.cutoff+1]],]))),
                distfun=pearson.dist,
                hclustfun=function(X){hclust(X,method="ward.D2")},
                labRow = NA,trace = "none",
                labCol = conds,
                col=hpal)

plot(as.hclust(hm_reduced$colDendrogram),xlab="",sub="")

👉 The different conditions are not so different in gene expression level except the extreme conditions

4.8 Clustering of samples

Done to assess the previous dendrogram’s reproducibility

hm.pvclust <- pvclust(data = t(scale(t(vst[sels[[vst.cutoff+1]],]))),
                       method.hclust = "ward.D2", 
                       nboot = 1000, parallel = TRUE)
## Creating a temporary cluster...done:
## socket cluster with 15 nodes on host 'localhost'
## Multiscale bootstrap... Done.

Plot the clustering with bp and au

plot(hm.pvclust, labels = conds, cex.axis=1.5)
pvrect(hm.pvclust)

👉 Some conditions clusterize better than others

bootstrapping results as a table
print(hm.pvclust, digits=3)
## 
## Cluster method: ward.D2
## Distance      : correlation
## 
## Estimates on edges:
## 
##       si    au    bp se.si se.au se.bp      v      c  pchi
## 1  0.738 0.870 0.864 0.031 0.020 0.004 -1.114  0.014 0.563
## 2  0.794 0.918 0.810 0.025 0.013 0.004 -1.134  0.257 0.810
## 3  1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 4  1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 5  1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 6  1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 7  0.824 0.905 0.934 0.029 0.019 0.003 -1.410 -0.098 0.583
## 8  0.800 0.888 0.936 0.032 0.022 0.003 -1.369 -0.153 0.559
## 9  0.590 0.797 0.790 0.036 0.024 0.004 -0.819  0.014 0.548
## 10 0.978 0.986 0.998 0.029 0.020 0.001 -2.537 -0.337 0.793
## 11 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 12 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 13 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 14 0.740 0.827 0.964 0.046 0.037 0.002 -1.368 -0.426 0.890
## 15 0.659 0.834 0.819 0.033 0.022 0.004 -0.939  0.029 0.499
## 16 0.659 0.845 0.783 0.033 0.020 0.004 -0.899  0.118 0.952
## 17 0.030 0.533 0.495 0.035 0.031 0.005 -0.036  0.048 0.637
## 18 0.645 0.891 0.591 0.033 0.015 0.005 -0.730  0.500 0.196
## 19 0.000 0.538 0.443 0.000 0.031 0.005  0.023  0.119 0.328
## 20 0.276 0.834 0.304 0.053 0.021 0.005 -0.229  0.743 0.355
## 21 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 22 0.082 0.859 0.167 0.076 0.022 0.004 -0.055  1.020 0.424
## 23 0.564 0.747 0.858 0.039 0.030 0.004 -0.867 -0.203 0.428
## 24 0.564 0.747 0.858 0.039 0.030 0.004 -0.867 -0.203 0.428
## 25 0.571 0.752 0.858 0.038 0.029 0.004 -0.875 -0.194 0.398
## 26 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000
## 27 1.000 1.000 1.000 0.000 0.000 0.000  0.000  0.000 0.000

 

5 Summary

The data are quite good so we can continue with our analysis

6 Session Info

Session Info
sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] parallel  grid      stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] emoji_15.0                  vsn_3.64.0                 
##  [3] tximport_1.24.0             forcats_0.5.2              
##  [5] stringr_1.4.1               dplyr_1.0.10               
##  [7] purrr_0.3.5                 readr_2.1.3                
##  [9] tidyr_1.2.1                 tibble_3.1.8               
## [11] tidyverse_1.3.2             pvclust_2.2-0              
## [13] plotly_4.10.1               magrittr_2.0.3             
## [15] hyperSpec_0.100.0           xml2_1.3.3                 
## [17] ggplot2_3.4.0               lattice_0.20-45            
## [19] here_1.0.1                  gplots_3.1.3               
## [21] DESeq2_1.36.0               SummarizedExperiment_1.26.1
## [23] Biobase_2.56.0              MatrixGenerics_1.8.1       
## [25] matrixStats_0.62.0          GenomicRanges_1.48.0       
## [27] GenomeInfoDb_1.32.4         IRanges_2.30.1             
## [29] S4Vectors_0.34.0            BiocGenerics_0.42.0        
## [31] data.table_1.14.6          
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.4.1           backports_1.4.1        lazyeval_0.2.2        
##   [4] splines_4.2.0          crosstalk_1.2.0        BiocParallel_1.30.4   
##   [7] digest_0.6.30          htmltools_0.5.3        fansi_1.0.3           
##  [10] memoise_2.0.1          googlesheets4_1.0.1    limma_3.52.4          
##  [13] tzdb_0.3.0             Biostrings_2.64.1      annotate_1.74.0       
##  [16] modelr_0.1.9           vroom_1.6.0            timechange_0.1.1      
##  [19] jpeg_0.1-9             colorspace_2.0-3       blob_1.2.3            
##  [22] rvest_1.0.3            haven_2.5.1            xfun_0.34             
##  [25] hexbin_1.28.2          crayon_1.5.2           RCurl_1.98-1.9        
##  [28] jsonlite_1.8.3         genefilter_1.78.0      survival_3.4-0        
##  [31] glue_1.6.2             gtable_0.3.1           gargle_1.2.1          
##  [34] zlibbioc_1.42.0        XVector_0.36.0         DelayedArray_0.22.0   
##  [37] scales_1.2.1           DBI_1.1.3              Rcpp_1.0.9            
##  [40] viridisLite_0.4.1      xtable_1.8-4           bit_4.0.4             
##  [43] preprocessCore_1.58.0  htmlwidgets_1.5.4      httr_1.4.4            
##  [46] RColorBrewer_1.1-3     ellipsis_0.3.2         farver_2.1.1          
##  [49] pkgconfig_2.0.3        XML_3.99-0.12          sass_0.4.2            
##  [52] dbplyr_2.2.1           deldir_1.0-6           locfit_1.5-9.6        
##  [55] utf8_1.2.2             labeling_0.4.2         tidyselect_1.2.0      
##  [58] rlang_1.0.6            AnnotationDbi_1.58.0   munsell_0.5.0         
##  [61] cellranger_1.1.0       tools_4.2.0            cachem_1.0.6          
##  [64] cli_3.4.1              generics_0.1.3         RSQLite_2.2.18        
##  [67] broom_1.0.1            evaluate_0.18          fastmap_1.1.0         
##  [70] yaml_2.3.6             knitr_1.40             bit64_4.0.5           
##  [73] fs_1.5.2               caTools_1.18.2         KEGGREST_1.36.3       
##  [76] brio_1.1.3             compiler_4.2.0         rstudioapi_0.14       
##  [79] png_0.1-7              testthat_3.1.5         affyio_1.66.0         
##  [82] reprex_2.0.2           geneplotter_1.74.0     bslib_0.4.1           
##  [85] stringi_1.7.8          highr_0.9              Matrix_1.5-1          
##  [88] vctrs_0.5.1            pillar_1.8.1           lifecycle_1.0.3       
##  [91] BiocManager_1.30.19    jquerylib_0.1.4        bitops_1.0-7          
##  [94] R6_2.5.1               latticeExtra_0.6-30    affy_1.74.0           
##  [97] KernSmooth_2.23-20     codetools_0.2-18       gtools_3.9.3          
## [100] assertthat_0.2.1       rprojroot_2.0.3        withr_2.5.0           
## [103] GenomeInfoDbData_1.2.8 hms_1.1.2              rmarkdown_2.18        
## [106] googledrive_2.0.0      lubridate_1.9.0        interp_1.1-3